Exploring performance#
In this demo, we analyse a Carnatic music performance end-to-end with the compIAM toolkit: downloading a concert recording, separating the singing voice, extracting the melody, and recognising the raga.
## Installing (if not) and importing compiam to the project
# Install any missing dependency via notebook magics, pinning versions known
# to work with compiam 0.4.1. Checking importlib.util.find_spec first keeps
# notebook re-runs fast and avoids touching a working environment.
import importlib.util
if importlib.util.find_spec('compiam') is None:
    %pip install -U compiam==0.4.1
if importlib.util.find_spec('essentia') is None:
    %pip install essentia
if importlib.util.find_spec('torch') is None:
    %pip install "torch==1.13"
if importlib.util.find_spec('tensorflow') is None:
    # keras<3 is required because TF 2.15 code paths here expect Keras 2.
    %pip install "tensorflow==2.15.0" "keras<3"
import compiam
import essentia.standard as estd
# Import extras and suppress warnings to keep the tutorial output clean
import os
import gdown
import zipfile
import numpy as np
import IPython.display as ipd
from pprint import pprint
import warnings
# NOTE(review): silencing all warnings is fine for a tutorial, but it also
# hides genuinely useful messages — do not copy this into production code.
warnings.filterwarnings('ignore')
# Folder where the demo audio is downloaded/extracted, and the artist
# sub-folder used for this particular concert.
AUDIO_PATH = os.path.join("..", "audio", "demos")
ARTIST = "dr-brindha-manickavasakan"
[ INFO ] MusicExtractorSVM: no classifier models were configured by default
We will work on a concert led by Dr. Brindha Manickavasakan, a well-known Carnatic music performer and doctor who has been closely involved in our research efforts. This concert took place during the December Season 2023 in Chennai, India, at the well-known Arkay Convention Centre. Please note that this particular recording is part of the newly published Saraga Audiovisual Dataset (A. Shankar et al., 2024), which will soon be available for access through mirdata and compIAM.
For now, we will download this particular concert and explore a given rendition.
url = "https://drive.google.com/uc?id=1iR0bfxDLQbH8fEeHU_GFsg2kh7brZ0HZ&export=download"
output = os.path.join(AUDIO_PATH, "dr-brindha-manickavasakan.zip")
gdown.download(url, output, quiet=False)
Once the audio is downloaded, we can extract all the files and remove the .zip file.
# Extract every file from the downloaded archive into the audio folder,
# then remove the archive so only the extracted files remain on disk.
archive = zipfile.ZipFile(output, 'r')
try:
    archive.extractall(AUDIO_PATH)
finally:
    archive.close()
# Delete zip file after extraction
os.remove(output)
Loading and visualising the data#
rendition = "Bhavanuta" # Selecting a rendition
We use Essentia to first load the mixture audio of the concert. The function AudioLoader can be used to load an audio signal from a file path and return the signal plus some important technical information about it.
# Build the path to the rendition's mixture recording and load it with
# Essentia. AudioLoader returns (audio, sampleRate, numberChannels, md5,
# bit_rate, codec); only the samples are kept here.
file_path = os.path.join(AUDIO_PATH, ARTIST, rendition, rendition + ".wav")
audio_mix, _, _, _, _, _ = estd.AudioLoader(filename=file_path)()
audio_mix = audio_mix.T # Put channels first: (channels, samples)
Let’s quickly listen to 30 seconds of this incredible performance!
# Play the first 30 seconds of the mixture (44.1 kHz sample rate).
ipd.Audio(audio_mix[..., :44100*30], rate=44100)
Low level feature extraction#
Tonic Identification#
Music Source Separation#
We now use a music source separation model, trained specifically for Carnatic music, to isolate the singing voice from the rest of the mixture.
from compiam import load_model
# This model uses tensorflow in the backend!
# Load the cold-diffusion vocal separation model and run it on the full
# mixture to isolate the singing voice.
separation_model = load_model("separation:cold-diff-sep")
separated_vocals = separation_model.separate(audio_mix)
# The separated output is mono: a one-dimensional array of samples.
separated_vocals.shape
(11343744,)
# Play 30 seconds of the separated vocals at the model's own sample rate.
ipd.Audio(separated_vocals[..., :44100*30], rate=separation_model.sample_rate)
For further reference, please visit the music source separation page.
Pitch Extraction#
from compiam import load_model
# This model uses tensorflow in the backend!
# Importing and initializing again a melodia instance (knowledge-based
# predominant-melody extraction).
### Salamon et al. 2012
from compiam.melody.pitch_extraction import Melodia
melodia = Melodia()
# Importing also a DL model (FTANet trained on Carnatic music) to extract
# the predominant melody.
### Plaja-Roglans et al. 2023
ftanet_carnatic = load_model("melody:ftanet-carnatic")
Predict the melody using both methods.
# Run both extractors on the mixture. Each returns an array of
# (time in seconds, pitch in Hz) rows — presumably shaped (num_frames, 2);
# the indexing below relies on that layout.
melodia_pitch_track = melodia.extract(audio_mix)
ftanet_pitch_track = ftanet_carnatic.predict(
    audio_mix,
    # melodia_pitch_track[1, 0] is the timestamp of the second frame, i.e.
    # Melodia's hop size — used so both tracks share the same time grid.
    out_step=melodia_pitch_track[1, 0], # Interpolating to same size
)
[2024-11-29 14:10:31,339] WARNING [compiam.melody.pitch_extraction.melodia.extract:92] Resampling... (input sampling rate is 44100Hz, make sure this is correct)
[2024-11-29 14:10:58,203] WARNING [compiam.melody.pitch_extraction.ftanet_carnatic.predict:306] Resampling... (input sampling rate is assumed 44100Hz, make sure this is correct and change input_sr otherwise)
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 5s 5s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 3s 3s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 4s 4s/step
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 2s 2s/step
Let’s visualize from sec. 4 to sec. 10 of the performance, together with the predicted pitch tracks using both methods.
# Take sec. 4 to sec. 10 of the first channel of the mixture for plotting.
audio_vis = audio_mix[0, 44100*4:44100*10]
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(15, 12))
# Log-magnitude spectrogram of the excerpt.
D = librosa.amplitude_to_db(np.abs(librosa.stft(audio_vis)), ref=np.max)
# BUG FIX: audio_vis starts at sec. 4, but specshow labels its time axis
# from 0. The original code then applied set_xlim(4, 10), so the wrong 2 s
# of spectrogram were shown under the pitch tracks. Passing explicit
# x_coords shifts the spectrogram's time axis to absolute time, aligning it
# with the pitch tracks (whose timestamps are absolute).
img = librosa.display.specshow(
    D, y_axis='linear', x_axis='time', sr=44100, ax=ax,
    x_coords=np.linspace(4, 10, D.shape[1]),
)
ax.set_ylim(0, 2000)  # Focus on the melodic frequency range
ax.set_xlim(4, 10)    # Absolute time span of the excerpt
# Overlay both pitch tracks (column 0 = time in s, column 1 = pitch in Hz).
plt.plot(
    melodia_pitch_track[:, 0], melodia_pitch_track[:, 1],
    color="white", label="Melodia",
)
plt.plot(
    ftanet_pitch_track[:, 0], ftanet_pitch_track[:, 1],
    color="black", label="FTANet-Carnatic",
)
plt.legend()
plt.show()
For further reference, please visit the pitch extraction page.
Percussion onset detection#
High level feature extraction#
Melodic pattern discovery#
Raga recognition#
from compiam import load_model
# This model uses torch in the backend (see the traceback below, which goes
# through torch.nn) — not tensorflow as the other models above.
deepsrgm = load_model("melody:deepsrgm")
# get_features internally extracts a Melodia pitch track and a multi-pitch
# tonic estimate from the audio.
# NOTE(review): featurising a full-length rendition produces a very large
# feature array; prediction can exhaust RAM on small machines (as the
# captured traceback shows) — consider cropping the audio first.
feat = deepsrgm.get_features(audio_mix)
[2024-11-29 14:14:04,989] WARNING [compiam.melody.raga_recognition.deepsrgm.get_features:246] Resampling... (input sampling rate is assumed 44100Hz, make sure this is correct and change input_sr otherwise)
[2024-11-29 14:14:04,995] INFO [compiam.melody.raga_recognition.deepsrgm.get_features:256] Extracting pitch track using melodia...
[2024-11-29 14:14:04,996] WARNING [compiam.melody.pitch_extraction.melodia.extract:92] Resampling... (input sampling rate is 44100Hz, make sure this is correct)
[2024-11-29 14:14:31,941] INFO [compiam.melody.raga_recognition.deepsrgm.get_features:259] Extracting tonic using multi-pitch approach...
[2024-11-29 14:14:31,942] WARNING [compiam.melody.tonic_identification.tonic_multipitch.extract:72] Resampling... (input sampling rate is 44100Hz, make sure this is correct)
# Predict the raga from the extracted features; the result is used below as
# a key into deepsrgm.mapping, so it is presumably a raga index/identifier.
predicted_raga = deepsrgm.predict(feat)
predicted_raga
[2024-11-29 14:14:40,869] INFO [compiam.melody.raga_recognition.deepsrgm.predict:309] Performing prediction for the following 10 ragas: ['Bhairav', 'Madhukauns', 'Mōhanaṁ', 'Hamsadhvāni', 'Varāḷi', 'Dēś', 'Kamās', 'Yaman kalyāṇ', 'Bilahari', 'Ahira bhairav']
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In [17], line 1
----> 1 predicted_raga = deepsrgm.predict(feat)
2 predicted_raga
File /opt/hostedtoolcache/Python/3.11.10/x64/lib/python3.11/site-packages/compiam/melody/raga_recognition/deepsrgm/__init__.py:315, in DEEPSRGM.predict(self, features, threshold, gpu)
309 logger.info(
310 "Performing prediction for the following {} ragas: {}".format(
311 len(list_of_ragas), list_of_ragas
312 )
313 )
314 with torch.no_grad():
--> 315 out = self.model.forward(torch.from_numpy(features).to(self.device).long())
316 preds = torch.argmax(out, axis=-1)
317 majority, _ = torch.mode(preds)
File /opt/hostedtoolcache/Python/3.11.10/x64/lib/python3.11/site-packages/compiam/melody/raga_recognition/deepsrgm/model.py:63, in deepsrgmModel.forward(self, x)
60 def forward(self, x):
61 # batch_size = x.size(0)
62 embeds = self.embeddings(x)
---> 63 out, _ = self.rnn(embeds)
64 # out = self.batchNorm1d(out)
65 out = self.attention_layer(out)
File /opt/hostedtoolcache/Python/3.11.10/x64/lib/python3.11/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
File /opt/hostedtoolcache/Python/3.11.10/x64/lib/python3.11/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()
File /opt/hostedtoolcache/Python/3.11.10/x64/lib/python3.11/site-packages/torch/nn/modules/rnn.py:1123, in LSTM.forward(self, input, hx)
1120 hx = self.permute_hidden(hx, sorted_indices)
1122 if batch_sizes is None:
-> 1123 result = _VF.lstm(
1124 input,
1125 hx,
1126 self._flat_weights,
1127 self.bias,
1128 self.num_layers,
1129 self.dropout,
1130 self.training,
1131 self.bidirectional,
1132 self.batch_first,
1133 )
1134 else:
1135 result = _VF.lstm(
1136 input,
1137 batch_sizes,
(...)
1144 self.bidirectional,
1145 )
RuntimeError: [enforce fail at alloc_cpu.cpp:117] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 24980501592 bytes. Error code 12 (Cannot allocate memory)
# Map the predicted identifier to a human-readable raga name.
deepsrgm.mapping[predicted_raga]
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In [18], line 1
----> 1 deepsrgm.mapping[predicted_raga]
NameError: name 'predicted_raga' is not defined